In [ ]:
import os
os.chdir("../data/raw")
os.getcwd()
Out[ ]:
'/root/restaurants/data/raw'
In [ ]:
import zipfile
with zipfile.ZipFile("dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("")

with zipfile.ZipFile("air_reserve.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")
with zipfile.ZipFile("hpg_reserve.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

with zipfile.ZipFile("air_visit_data.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")
with zipfile.ZipFile("air_store_info.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")
with zipfile.ZipFile("hpg_store_info.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

with zipfile.ZipFile("date_info.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")

with zipfile.ZipFile("store_id_relation.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("")
In [ ]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
import random

import plotly.graph_objects as go
import plotly.express as px

air_reserve¶

In [ ]:
air_reserve = pd.read_csv("air_reserve.csv")
profile = ProfileReport(air_reserve, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
air_reserve.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92378 entries, 0 to 92377
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   air_store_id      92378 non-null  object
 1   visit_datetime    92378 non-null  object
 2   reserve_datetime  92378 non-null  object
 3   reserve_visitors  92378 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 2.8+ MB
In [ ]:
air_reserve['visit_datetime'] = pd.to_datetime(air_reserve['visit_datetime'])
air_reserve['reserve_datetime'] = pd.to_datetime(air_reserve['reserve_datetime'])

hpg_reserve¶

In [ ]:
hpg_reserve = pd.read_csv("hpg_reserve.csv")
profile = ProfileReport(hpg_reserve, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
hpg_reserve.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000320 entries, 0 to 2000319
Data columns (total 4 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   hpg_store_id      object
 1   visit_datetime    object
 2   reserve_datetime  object
 3   reserve_visitors  int64 
dtypes: int64(1), object(3)
memory usage: 61.0+ MB
In [ ]:
hpg_reserve['visit_datetime'] = pd.to_datetime(hpg_reserve['visit_datetime'])
hpg_reserve['reserve_datetime'] = pd.to_datetime(hpg_reserve['reserve_datetime'])
In [ ]:
hpg_reserve = hpg_reserve.rename(columns={"hpg_store_id": "store_id"})
air_reserve = air_reserve.rename(columns={"air_store_id": "store_id"})
reserve_data = pd.concat([air_reserve, hpg_reserve])

reserve_data
Out[ ]:
store_id visit_datetime reserve_datetime reserve_visitors
0 air_877f79706adbfb06 2016-01-01 19:00:00 2016-01-01 16:00:00 1
1 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00 3
2 air_db4b38ebe7a7ceff 2016-01-01 19:00:00 2016-01-01 19:00:00 6
3 air_877f79706adbfb06 2016-01-01 20:00:00 2016-01-01 16:00:00 2
4 air_db80363d35f10926 2016-01-01 20:00:00 2016-01-01 01:00:00 5
... ... ... ... ...
2000315 hpg_2b293477ea4642fb 2017-05-31 20:00:00 2017-04-19 07:00:00 2
2000316 hpg_f922aeca48ca5a29 2017-05-31 20:00:00 2017-03-19 00:00:00 3
2000317 hpg_e9151de687b93aa3 2017-05-31 21:00:00 2017-04-14 20:00:00 49
2000318 hpg_fae6c96189b4a122 2017-05-31 21:00:00 2017-04-19 17:00:00 9
2000319 hpg_0b70de808b55ad1e 2017-05-31 23:00:00 2017-04-21 13:00:00 6

2092698 rows × 4 columns

visit data¶

In [ ]:
hpg_visit_data = hpg_reserve.drop(columns = ['reserve_datetime']).groupby(by = ["store_id", "visit_datetime"]).sum().reset_index()
hpg_visit_data = hpg_visit_data.rename(columns={"visit_datetime": "visit_date", "reserve_visitors": "visitors"})
profile = ProfileReport(hpg_visit_data, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
hpg_visit_data['visit_date'] = pd.to_datetime(hpg_visit_data['visit_date'])
hpg_visit_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1711380 entries, 0 to 1711379
Data columns (total 3 columns):
 #   Column      Dtype         
---  ------      -----         
 0   store_id    object        
 1   visit_date  datetime64[ns]
 2   visitors    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 39.2+ MB
In [ ]:
air_visit_data = pd.read_csv("air_visit_data.csv")
profile = ProfileReport(air_visit_data, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
air_visit_data['visit_date'] = pd.to_datetime(air_visit_data['visit_date'])
air_visit_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252108 entries, 0 to 252107
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   air_store_id  252108 non-null  object        
 1   visit_date    252108 non-null  datetime64[ns]
 2   visitors      252108 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.8+ MB
In [ ]:
hpg_visit_data = hpg_visit_data.rename(columns={"hpg_store_id": "store_id"})
air_visit_data = air_visit_data.rename(columns={"air_store_id": "store_id"})
visit_data = pd.concat([air_visit_data, hpg_visit_data])
visit_data['visit_date'] = visit_data['visit_date'].dt.date
visit_data
Out[ ]:
store_id visit_date visitors
0 air_ba937bf13d40fb24 2016-01-13 25
1 air_ba937bf13d40fb24 2016-01-14 32
2 air_ba937bf13d40fb24 2016-01-15 29
3 air_ba937bf13d40fb24 2016-01-16 22
4 air_ba937bf13d40fb24 2016-01-18 6
... ... ... ...
1711375 hpg_fffc097dce87af3e 2017-03-10 2
1711376 hpg_fffc097dce87af3e 2017-03-17 2
1711377 hpg_fffc097dce87af3e 2017-03-27 5
1711378 hpg_fffc097dce87af3e 2017-04-21 3
1711379 hpg_fffc097dce87af3e 2017-04-23 8

1963488 rows × 3 columns

air_store_info¶

In [ ]:
air_store_info = pd.read_csv("air_store_info.csv")
profile = ProfileReport(air_store_info, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

hpg_store_info¶

In [ ]:
hpg_store_info = pd.read_csv("hpg_store_info.csv")
profile = ProfileReport(hpg_store_info, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
hpg_store_info = hpg_store_info.rename(columns={"hpg_store_id": "store_id", "hpg_genre_name": "genre_name", "hpg_area_name": "area_name"})
air_store_info = air_store_info.rename(columns={"air_store_id": "store_id", "air_genre_name": "genre_name", "air_area_name": "area_name"})
In [ ]:
store_info = pd.concat([air_store_info, hpg_store_info])
store_info
Out[ ]:
store_id genre_name area_name latitude longitude
0 air_0f0cdeee6c9bf3d7 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.695124 135.197853
1 air_7cc17a324ae5c7dc Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.695124 135.197853
2 air_fee8dcf4d619598e Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.695124 135.197853
3 air_a17f0778617c76e2 Italian/French Hyōgo-ken Kōbe-shi Kumoidōri 34.695124 135.197853
4 air_83db5aff8f50478e Italian/French Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
... ... ... ... ... ...
4685 hpg_c6ed353a48c7c93e Sichuan food Tōkyō-to Chiyoda-ku None 35.695780 139.768453
4686 hpg_a58d2423360b2fb4 Sichuan food Tōkyō-to Minato-ku Roppongi 35.666137 139.733978
4687 hpg_d2021704076cdd70 Shanghai food Tōkyō-to Chiyoda-ku None 35.695780 139.768453
4688 hpg_602472570d65e219 Spain/Mediterranean cuisine Tōkyō-to Toshima-ku None 35.730068 139.711742
4689 hpg_11801306ce8bfb4f Udon/Soba Osaka Prefecture Osaka None 34.701519 135.498859

5519 rows × 5 columns

date_info¶

In [ ]:
date_info = pd.read_csv("date_info.csv")
date_info
Out[ ]:
calendar_date day_of_week holiday_flg
0 2016-01-01 Friday 1
1 2016-01-02 Saturday 1
2 2016-01-03 Sunday 1
3 2016-01-04 Monday 0
4 2016-01-05 Tuesday 0
... ... ... ...
512 2017-05-27 Saturday 0
513 2017-05-28 Sunday 0
514 2017-05-29 Monday 0
515 2017-05-30 Tuesday 0
516 2017-05-31 Wednesday 0

517 rows × 3 columns

In [ ]:
date_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   calendar_date  517 non-null    object
 1   day_of_week    517 non-null    object
 2   holiday_flg    517 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 12.2+ KB
In [ ]:
visit_data
Out[ ]:
store_id visit_date visitors
0 air_ba937bf13d40fb24 2016-01-13 25
1 air_ba937bf13d40fb24 2016-01-14 32
2 air_ba937bf13d40fb24 2016-01-15 29
3 air_ba937bf13d40fb24 2016-01-16 22
4 air_ba937bf13d40fb24 2016-01-18 6
... ... ... ...
1711375 hpg_fffc097dce87af3e 2017-03-10 2
1711376 hpg_fffc097dce87af3e 2017-03-17 2
1711377 hpg_fffc097dce87af3e 2017-03-27 5
1711378 hpg_fffc097dce87af3e 2017-04-21 3
1711379 hpg_fffc097dce87af3e 2017-04-23 8

1963488 rows × 3 columns

In [ ]:
visit_data
Out[ ]:
store_id visit_date visitors
0 air_ba937bf13d40fb24 2016-01-13 25
1 air_ba937bf13d40fb24 2016-01-14 32
2 air_ba937bf13d40fb24 2016-01-15 29
3 air_ba937bf13d40fb24 2016-01-16 22
4 air_ba937bf13d40fb24 2016-01-18 6
... ... ... ...
1711375 hpg_fffc097dce87af3e 2017-03-10 2
1711376 hpg_fffc097dce87af3e 2017-03-17 2
1711377 hpg_fffc097dce87af3e 2017-03-27 5
1711378 hpg_fffc097dce87af3e 2017-04-21 3
1711379 hpg_fffc097dce87af3e 2017-04-23 8

1963488 rows × 3 columns

In [ ]:
visit_data = visit_data.rename(columns={"visit_date": "date"})
visit_data['date'] = visit_data['date'].astype("string")

date_info = date_info.rename(columns={"calendar_date": "date"})
date_info['date'] = date_info['date'].astype("string")
In [ ]:
data = pd.merge(visit_data, date_info, on="date")
data = pd.merge(data, store_info, on="store_id")
data['date'] = pd.to_datetime(data['date'])
profile = ProfileReport(data, title="Profiling Report")
profile
data
Out[ ]:
store_id date visitors day_of_week holiday_flg genre_name area_name latitude longitude
0 air_ba937bf13d40fb24 2016-01-13 25 Wednesday 0 Dining bar Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
1 air_ba937bf13d40fb24 2016-01-14 32 Thursday 0 Dining bar Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
2 air_ba937bf13d40fb24 2016-01-15 29 Friday 0 Dining bar Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
3 air_ba937bf13d40fb24 2016-01-16 22 Saturday 0 Dining bar Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
4 air_ba937bf13d40fb24 2016-01-18 6 Monday 0 Dining bar Tōkyō-to Minato-ku Shibakōen 35.658068 139.751599
... ... ... ... ... ... ... ... ... ...
982086 hpg_ffe96ca8c6ec10cf 2017-03-31 8 Friday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619
982087 hpg_ffe96ca8c6ec10cf 2017-04-05 4 Wednesday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619
982088 hpg_ffe96ca8c6ec10cf 2017-04-07 11 Friday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619
982089 hpg_ffe96ca8c6ec10cf 2017-04-15 3 Saturday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619
982090 hpg_ffe96ca8c6ec10cf 2017-04-24 16 Monday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619

982091 rows × 9 columns

In [ ]:
temp = deepcopy(data)
temp = temp.groupby(by=['holiday_flg']).count().reset_index().drop(columns=["date", "visitors", "day_of_week", "genre_name", "area_name", "latitude", "longitude"])
temp = temp.rename(columns={"store_id": "count"})

fig = px.bar(temp, x="holiday_flg", y="count",
             barmode='group',
             height=400)
fig.show()
In [ ]:
temp = deepcopy(data)
temp = temp.drop(columns=["store_id", "date", "holiday_flg", "genre_name", "area_name", "latitude", "longitude"])
temp = temp.groupby('day_of_week').sum().reset_index().sort_values("visitors")

fig = px.bar(temp, x='day_of_week', y='visitors',
             color='visitors',
             labels={'pop':'population of Canada'}, height=400)
fig.show()
In [ ]:
store_id_relation = pd.read_csv("store_id_relation.csv")
store_id_relation
Out[ ]:
air_store_id hpg_store_id
0 air_63b13c56b7201bd9 hpg_4bc649e72e2a239a
1 air_a24bf50c3e90d583 hpg_c34b496d0305a809
2 air_c7f78b4f3cba33ff hpg_cd8ae0d9bbd58ff9
3 air_947eb2cae4f3e8f2 hpg_de24ea49dc25d6b8
4 air_965b2e0cf4119003 hpg_653238a84804d8e7
... ... ...
145 air_b8925441167c3152 hpg_0cc0abae3a8b52de
146 air_f8233ad00755c35c hpg_098e4dd30e54fee6
147 air_6607fe3671242ce3 hpg_1004356dc8ab69ec
148 air_ccd19a5bc5573ae5 hpg_df22bf146d2c7e19
149 air_465bddfed3353b23 hpg_23c5945dc1a72454

150 rows × 2 columns

In [ ]:
air_store_info = air_store_info.rename(columns={"store_id": "air_store_id", "genre_name": "air_genre_name", "area_name": "air_area_name", "latitude": "air_latitude", "longitude": "air_longitude"})
hpg_store_info = hpg_store_info.rename(columns={"store_id": "hpg_store_id", "genre_name": "hpg_genre_name", "area_name": "hpg_area_name", "latitude": "hpg_latitude", "longitude": "hpg_longitude"})

store_relation = pd.merge(store_id_relation, air_store_info, on="air_store_id", how="left")
store_relation = pd.merge(store_relation, hpg_store_info, on="hpg_store_id", how="left")

store_relation.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   air_store_id    150 non-null    object 
 1   hpg_store_id    150 non-null    object 
 2   air_genre_name  150 non-null    object 
 3   air_area_name   150 non-null    object 
 4   air_latitude    150 non-null    float64
 5   air_longitude   150 non-null    float64
 6   hpg_genre_name  63 non-null     object 
 7   hpg_area_name   63 non-null     object 
 8   hpg_latitude    63 non-null     float64
 9   hpg_longitude   63 non-null     float64
dtypes: float64(4), object(6)
memory usage: 11.8+ KB
In [ ]:
sns.heatmap(store_relation.isna(), yticklabels=False, cbar=False)
Out[ ]:
<Axes: >
In [ ]:
store_relation = pd.read_csv("store_id_relation.csv")

store_relation = pd.merge(store_relation, air_store_info, on="air_store_id", how="left")
store_relation = pd.merge(store_relation, hpg_store_info, on="hpg_store_id")

air_store_info = air_store_info.rename(columns={"air_store_id": "store_id", "air_genre_name": "genre_name", "air_area_name": "area_name", "air_latitude": "latitude", "air_longitude": "longitude"})
hpg_store_info = hpg_store_info.rename(columns={"hpg_store_id": "store_id", "hpg_genre_name": "genre_name", "hpg_area_name": "area_name", "hpg_latitude": "latitude", "hpg_longitude": "longitude"})
In [ ]:
import geopy.distance
from geopy import Point

store_relation["air_point"] = store_relation.apply(lambda row: Point(latitude=row['air_latitude'], longitude=row['air_longitude']), axis=1)
store_relation["hpg_point"] = store_relation.apply(lambda row: Point(latitude=row['hpg_latitude'], longitude=row['hpg_longitude']), axis=1)
store_relation["distance"] = store_relation["air_latitude"]

for i in range(store_relation.shape[0]):
    store_relation.loc[i, "distance"] = geopy.distance.distance(store_relation.loc[i, "air_point"], store_relation.loc[i, "hpg_point"]).km

store_relation = store_relation.sort_values("distance").reset_index(drop=True)
store_relation.tail()
Out[ ]:
air_store_id hpg_store_id air_genre_name air_area_name air_latitude air_longitude hpg_genre_name hpg_area_name hpg_latitude hpg_longitude air_point hpg_point distance
58 air_d00161e19f08290b hpg_05fada27f04e4383 Dining bar Tōkyō-to Shinjuku-ku Kabukichō 35.693840 139.703549 International cuisine None None None 35.688589 139.731971 35 41m 37.8244s N, 139 42m 12.7778s E 35 41m 18.9216s N, 139 43m 55.0955s E 2.637691
59 air_c1ff20617c54fee7 hpg_4be4a5cb851e45af Izakaya Tōkyō-to Meguro-ku Kamimeguro 35.641463 139.698171 International cuisine Tōkyō-to Minato-ku Takanawa 35.637183 139.737998 35 38m 29.2664s N, 139 41m 53.4163s E 35 38m 13.8586s N, 139 44m 16.7929s E 3.638336
60 air_258ad2619d7bff9a hpg_e87d00be48cd91c6 Izakaya Tōkyō-to Kōtō-ku Tōyō 35.672854 139.817410 Japanese style Tōkyō-to Sumida-ku Tachibana 35.704960 139.828642 35 40m 22.2726s N, 139 49m 2.67492s E 35 42m 17.8562s N, 139 49m 43.1123s E 3.704582
61 air_48f4da6223571da4 hpg_832ba309e6699258 Italian/French Tōkyō-to Tachikawa-shi Izumichō 35.714014 139.407843 Italian Tōkyō-to Hachiōji-shi Ishikawamachi 35.677207 139.373870 35 42m 50.4511s N, 139 24m 28.2352s E 35 40m 37.9442s N, 139 22m 25.9306s E 5.112082
62 air_640cf4835f0d9ba3 hpg_e63863c8987bd064 Izakaya Tōkyō-to Ōta-ku Kamata 35.561257 139.716051 Japanese style Kanagawa-ken Yokohama-shi Nagatsutachō 35.512762 139.495733 35 33m 40.5245s N, 139 42m 57.784s E 35 30m 45.9439s N, 139 29m 44.6382s E 20.691947
In [ ]:
import plotly.express as px

fig = px.histogram(store_relation, x='distance')
fig.show()
In [ ]:
def get_delta(df):
    temp = deepcopy(df)
    
    temp["delta"] = temp["visit_datetime"] - temp["reserve_datetime"]
    temp = temp.rename(columns={"visit_datetime": "date"})
    temp['date'] = temp['date'].dt.date
    temp = temp.drop(columns=["reserve_datetime", "store_id"])

    def delta_days(x):
        if x.days < 1:
            return "<1"
        elif x.days <= 7:
            return "1 - 7"
        else:
            return ">7"


    temp["delta_days"] = temp["delta"].apply(delta_days)
    temp = temp.groupby(by=["date", "delta_days"]).reserve_visitors.agg(["sum", "count"]).reset_index().sort_values("date")

    fig = px.line(temp, x="date", y="count", color='delta_days')
    fig.show()
    fig = px.line(temp, x="date", y="sum", color='delta_days')
    fig.show()
In [ ]:
get_delta(air_reserve)
In [ ]:
get_delta(reserve_data)
In [ ]:
import plotly.express as px

temp = deepcopy(visit_data)

temp["store_id"] = temp["store_id"].str[:3]
temp = temp.groupby(by=["date", "store_id"]).sum().reset_index()

fig = px.line(temp, x="date", y="visitors", color='store_id')
fig.show()
In [ ]:
import plotly.express as px

temp = deepcopy(data)
temp["store_id"] = temp["store_id"].str[:3]

temp1 = deepcopy(temp)
temp1["store_id"] = "all"

temp = pd.concat([temp, temp1])

temp = temp.groupby(by=["date", "store_id"]).sum().reset_index()

fig = px.line(temp, x="date", y="visitors", color='store_id')
fig.show()
In [ ]:
len(visit_data['store_id'].unique())
Out[ ]:
14154
In [ ]:
len(data['store_id'].unique())
Out[ ]:
5519
In [ ]:
temp = deepcopy(reserve_data)
    
temp["delta"] = temp["visit_datetime"] - temp["reserve_datetime"]
temp = temp.rename(columns={"visit_datetime": "date"})
temp['date'] = temp['date'].dt.date
temp = temp.drop(columns=["reserve_datetime"])
temp = temp.groupby(by=["store_id", "date"]).sum().reset_index()
temp["date"] = pd.to_datetime(temp["date"])
data = pd.merge(data, temp, on=["store_id", "date"])
In [ ]:
data
Out[ ]:
store_id date visitors day_of_week holiday_flg genre_name area_name latitude longitude reserve_visitors delta
0 air_35512c42db0868da 2016-02-27 6 Saturday 0 Dining bar Tōkyō-to Musashino-shi Midorichō 35.717784 139.566260 12 0 days 01:00:00
1 air_ee3a01f0c71a769f 2016-01-04 61 Monday 0 Cafe/Sweets Shizuoka-ken Hamamatsu-shi Motoshirochō 34.710895 137.725940 2 0 days 04:00:00
2 air_ee3a01f0c71a769f 2016-01-08 21 Friday 0 Cafe/Sweets Shizuoka-ken Hamamatsu-shi Motoshirochō 34.710895 137.725940 3 2 days 05:00:00
3 air_ee3a01f0c71a769f 2016-01-09 57 Saturday 0 Cafe/Sweets Shizuoka-ken Hamamatsu-shi Motoshirochō 34.710895 137.725940 25 34 days 13:00:00
4 air_ee3a01f0c71a769f 2016-01-10 32 Sunday 0 Cafe/Sweets Shizuoka-ken Hamamatsu-shi Motoshirochō 34.710895 137.725940 5 4 days 09:00:00
... ... ... ... ... ... ... ... ... ... ... ...
758042 hpg_ffe96ca8c6ec10cf 2017-03-31 8 Friday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619 8 29 days 03:00:00
758043 hpg_ffe96ca8c6ec10cf 2017-04-05 4 Wednesday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619 4 13 days 10:00:00
758044 hpg_ffe96ca8c6ec10cf 2017-04-07 11 Friday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619 11 6 days 21:00:00
758045 hpg_ffe96ca8c6ec10cf 2017-04-15 3 Saturday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619 3 40 days 06:00:00
758046 hpg_ffe96ca8c6ec10cf 2017-04-24 16 Monday 0 Italian Tōkyō-to Chūō-ku None 35.682504 139.773619 16 5 days 22:00:00

758047 rows × 11 columns

In [ ]:
def samples(df):
    ids = random.choices(data.where(data["store_id"].str[:3] == "air")["store_id"].unique(), k=3)

    for id in ids:
        temp = df.where(df["store_id"] == id).dropna().reset_index(drop=True)
        temp.sort_values("date")

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=temp["date"], y=temp["visitors"],
                    mode='lines',
                    name='visitors'))
        fig.add_trace(go.Scatter(x=temp["date"], y=temp["reserve_visitors"],
                    mode='lines',
                    name='reserve_visitors'))
        fig.update_layout(title=id,
                   xaxis_title='date')

        fig.show()
        print(id)
In [ ]:
samples(data)
air_63a750d8b4b6a976
air_d07e57b21109304a
air_3e93f3c81008696d
In [ ]:
temp = deepcopy(data)
temp["month"] = [date.month_name() for date in temp["date"]]
temp = temp.drop(columns=["store_id", "date", "holiday_flg", "genre_name", "area_name", "latitude", "longitude", "day_of_week"])
temp = temp.groupby('month').mean().reset_index().sort_values("visitors")

fig = px.bar(temp, x='month', y='visitors',
             color='visitors', height=400)
fig.show()
In [ ]:
fig = px.bar(data.groupby("holiday_flg")["visitors"].mean().reset_index().sort_values("visitors"), x='holiday_flg', y='visitors',
             color='visitors', height=400)
fig.show()
In [ ]:
def genre_area_stores(df):
    temp = deepcopy(df)
    temp = temp.groupby('genre_name').count().reset_index()
    temp = temp.drop(columns=["store_id", "latitude", "longitude"])
    temp = temp.rename(columns={"area_name": "stores"}).sort_values("stores")


    fig = px.bar(temp.tail(50), x='genre_name', y='stores',
                 color='stores', height=400)
    fig.show()

    temp = deepcopy(df)
    temp = temp.groupby('area_name').count().reset_index()
    temp = temp.drop(columns=["store_id", "latitude", "longitude"])
    temp = temp.rename(columns={"genre_name": "stores"}).sort_values("stores")


    fig = px.bar(temp.tail(50), x='area_name', y='stores',
                 color='stores', height=600)
    fig.show()
In [ ]:
genre_area_stores(air_store_info)
In [ ]:
genre_area_stores(hpg_store_info)
In [ ]:
genre_area_stores(data)
In [ ]:
def genre_area_visitors(df):
    temp = deepcopy(df.drop(columns=["store_id", "latitude", "longitude", "date", "day_of_week", "holiday_flg", "delta", "area_name", "reserve_visitors"]))
    temp = temp.groupby('genre_name').sum().reset_index().sort_values("visitors")


    fig = px.bar(temp.tail(50), x='genre_name', y='visitors',
                 color='visitors', height=400)
    fig.show()

    temp = deepcopy(df.drop(columns=["store_id", "latitude", "longitude", "date", "day_of_week", "holiday_flg", "delta", "genre_name", "reserve_visitors"]))
    temp = temp.groupby('area_name').sum().reset_index().sort_values("visitors")


    fig = px.bar(temp.tail(50), x='area_name', y='visitors',
                 color='visitors', height=600)
    fig.show()
In [ ]:
genre_area_visitors(data)
In [ ]:
# from shapely.geometry import Point
# import geopandas

# longitudes = data["longitude"].tolist()
# latitudes = data["latitude"].tolist()

# world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
# japan = world[world.name == 'Japan']
# fig,ax = plt.subplots(figsize = (10,12))
# japan.plot(ax = ax)

# geometry = [Point(xy) for xy in zip(longitudes,latitudes)]
# geo_df = geopandas.GeoDataFrame(geometry = geometry)

# g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Delhi')
# plt.show()
In [ ]:
profile = ProfileReport(data, title="Profiling Report")
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

The Kernel crashed while executing code in the current cell or a previous cell. 

Please review the code in the cell(s) to identify a possible cause of the failure. 

Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. 

View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.